{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "\n",
    "import Spider\n",
    "\n",
    "def highlight(text: str, keyword: str):\n",
    "    idx = text.lower().find(keyword.lower())\n",
    "    result = text\n",
    "    if idx >= 0:\n",
    "        ori_word = text[idx:idx+(len(keyword))]\n",
    "        result = text.replace(ori_word, f'*{ori_word}*')\n",
    "    return result\n",
    "\n",
    "def score(item, keyword: str):\n",
    "    title_score = item[1].lower().count(keyword.lower())\n",
    "    content_score = item[2].lower().count(keyword.lower())\n",
    "    score_result = title_score * 5 + content_score * 3\n",
    "    return score_result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "outputs": [],
   "source": [
    "import jieba\n",
    "\n",
    "class MySearcherC7V0:\n",
    "    \"\"\"\n",
    "    第六次课升级的搜索类版本：\n",
    "    1、避免重复查询相同词\n",
    "    2、尽量减少lower()运行的次数\n",
    "    3、用文档刷词构建缓存\n",
    "    4、去掉search里的文档扫描过程\n",
    "    \"\"\"\n",
    "    def __init__(self, scale: int=1):\n",
    "        self.docs = list()\n",
    "        self.load_data()\n",
    "        if scale > 1:\n",
    "            self.docs *= scale  # 文档规模倍增，用于测试搜索速度\n",
    "        self.cache = dict()\n",
    "        self.vocab = set()\n",
    "        self.lower_preprocess()\n",
    "        self.build_cache()\n",
    "\n",
    "    def load_data(self, data_file_name='./news_list.pkl'):\n",
    "        if os.path.exists(data_file_name):\n",
    "            self.docs = Spider.pickle_load(data_file_name)\n",
    "        else:\n",
    "            Spider.pickle_save(data_file_name)\n",
    "            self.docs = Spider.pickle_load(data_file_name)\n",
    "\n",
    "    def search(self, keyword):\n",
    "        keyword_l = keyword.lower()\n",
    "        if keyword_l in self.cache:\n",
    "            sorted_result = self.cache[keyword_l]\n",
    "        else:\n",
    "            sorted_result = []\n",
    "        return sorted_result\n",
    "\n",
    "    def render_search_result(self, keyword):\n",
    "        count = 0\n",
    "        for item in self.search(keyword):\n",
    "            count += 1\n",
    "            print(f'{count}[{item[1]}] {highlight(self.docs[item[0]][1], keyword)}')\n",
    "\n",
    "    def build_cache(self):\n",
    "        \"\"\"用分词（用文档过滤词库）的方式初始化缓存\"\"\"\n",
    "        doc_id = 0\n",
    "        for doc in self.docs:\n",
    "            doc_word_set = set()\n",
    "            for word in jieba.cut(doc[3]):\n",
    "                if word not in doc_word_set:\n",
    "                    result_item = [doc_id, score(doc, word), doc[1]]\n",
    "                    if word not in self.cache:\n",
    "                        self.cache[word] = [result_item]\n",
    "                    else:\n",
    "                        self.cache[word].append(result_item)\n",
    "                self.vocab.add(word)\n",
    "                doc_word_set.add(word)\n",
    "            doc_id += 1\n",
    "        for word in self.cache:\n",
    "            self.cache[word].sort(key=lambda x: x[1], reverse=True)\n",
    "\n",
    "    def lower_preprocess(self):\n",
    "        for doc_id in range(len(self.docs)):\n",
    "            self.docs[doc_id].append(\n",
    "                (self.docs[doc_id][1] + ' ' + self.docs[doc_id][2]).lower())\n",
    "\n",
    "    def simple_test(self):\n",
    "        assert(len(self.search('tiktok')) > 1)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 4.36 s\n"
     ]
    }
   ],
   "source": [
    "# Time the construction of the baseline searcher; __init__ loads the documents and builds the cache.\n",
    "%time searcher_v0 = MySearcherC7V0()\n"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "outputs": [],
   "source": [
    "class MySearcherC7V1(MySearcherC7V0):\n",
    "    \"\"\"\n",
    "    Inverted-index searcher that supports multi-term queries.\n",
    "\n",
    "    1. load a custom segmentation dictionary during initialization\n",
    "    2. segment documents with jieba's search-engine mode (cut_for_search)\n",
    "\n",
    "    3. segment the query\n",
    "    4. fetch the posting set of every query term\n",
    "    5. intersect the posting sets\n",
    "    6. store postings as bare doc_ids\n",
    "    \"\"\"\n",
    "    def __init__(self, scale: int = 1, user_dict: str = './dict.txt'):\n",
    "        \"\"\"\n",
    "        :param scale: forwarded to the base-class initializer.\n",
    "        :param user_dict: path of the custom jieba dictionary, loaded before\n",
    "            the base class builds the index.\n",
    "        \"\"\"\n",
    "        jieba.load_userdict(user_dict)\n",
    "        super().__init__(scale)\n",
    "\n",
    "    def build_cache(self):\n",
    "        \"\"\"Build the index: map every word to the set of doc_ids containing it.\"\"\"\n",
    "        for doc_id, doc in enumerate(self.docs):\n",
    "            for word in set(jieba.cut_for_search(doc[3])):\n",
    "                self.cache.setdefault(word, set()).add(doc_id)\n",
    "                self.vocab.add(word)\n",
    "\n",
    "    @staticmethod\n",
    "    def _query_terms(query):\n",
    "        \"\"\"Segment the lowercased query, dropping whitespace-only tokens.\"\"\"\n",
    "        return [term for term in jieba.cut(query.lower()) if term.strip()]\n",
    "\n",
    "    @staticmethod\n",
    "    def _score_terms(item, terms):\n",
    "        \"\"\"Sum 5 per occurrence in item[1] and 3 per occurrence in item[2] over terms.\"\"\"\n",
    "        # Lowercase once per document rather than once per term.\n",
    "        title = item[1].lower()\n",
    "        content = item[2].lower()\n",
    "        return sum(title.count(term) * 5 + content.count(term) * 3 for term in terms)\n",
    "\n",
    "    def search(self, query):\n",
    "        \"\"\"\n",
    "        Return [doc_id, score] pairs for the documents that contain every\n",
    "        query term, best score first.\n",
    "\n",
    "        A term missing from the index, or a query without terms, gives [].\n",
    "        \"\"\"\n",
    "        result = None\n",
    "        for term in self._query_terms(query):\n",
    "            postings = self.cache.get(term)\n",
    "            if postings is None:\n",
    "                return []  # AND semantics: one unknown term empties the result\n",
    "            # `&` builds a new set, so the cached posting sets are never mutated.\n",
    "            result = postings if result is None else result & postings\n",
    "        if not result:\n",
    "            return []\n",
    "        return self.rank(query, result)\n",
    "\n",
    "    def rank(self, query, result_set):\n",
    "        \"\"\"Score every doc_id in result_set for query; return [doc_id, score], best first.\"\"\"\n",
    "        terms = self._query_terms(query)  # segment once, not once per document\n",
    "        result = [[doc_id, self._score_terms(self.docs[doc_id], terms)]\n",
    "                  for doc_id in result_set]\n",
    "        result.sort(key=lambda x: x[1], reverse=True)\n",
    "        return result\n",
    "\n",
    "    def score(self, item, query):\n",
    "        \"\"\"Relevance of one document for query, summed over the query's terms.\"\"\"\n",
    "        return self._score_terms(item, self._query_terms(query))\n",
    "\n",
    "    def render_search_result(self, keyword):\n",
    "        \"\"\"\n",
    "        Print one line per hit: rank, score in brackets, and doc[1] with every\n",
    "        query term highlighted (the inherited version only marks the whole\n",
    "        query string, which multi-term queries rarely contain verbatim).\n",
    "        \"\"\"\n",
    "        # dict.fromkeys drops repeated terms so nothing is wrapped twice.\n",
    "        terms = list(dict.fromkeys(self._query_terms(keyword)))\n",
    "        for count, item in enumerate(self.search(keyword), start=1):\n",
    "            title = self.docs[item[0]][1]\n",
    "            for term in terms:\n",
    "                title = highlight(title, term)\n",
    "            print(f'{count}[{item[1]}] {title}')"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 1.78 s\n"
     ]
    }
   ],
   "source": [
    "# Time the construction of the V1 searcher: load the jieba user dictionary, then run the inherited initialization.\n",
    "%time searcher_v1 = MySearcherC7V1()"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "outputs": [
    {
     "data": {
      "text/plain": "[[6, 235],\n [99, 131],\n [75, 116],\n [115, 102],\n [24, 99],\n [214, 98],\n [1, 93],\n [183, 90],\n [98, 77],\n [172, 68],\n [203, 68],\n [196, 48],\n [200, 21]]"
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Multi-term query: returns [doc_id, score] pairs, highest score first.\n",
    "searcher_v1.search('华为手机')"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1[235] 传华为预计今年智能手机出货量同比减少60%，降至7000万部\n",
      "2[131] 华为将在英国起诉汇丰，要拿到孟晚舟案关键文件\n",
      "3[116] 华为推\"智慧养猪\"，任正非曾称如果养猪可能也是状元\n",
      "4[102] 华为推“智慧养猪”，任正非：华为不靠手机也能活\n",
      "5[99] 华为折叠屏手机华为MateX2将于下周一20时发布\n",
      "6[98] 华为Mate X2镜头供应商为舜宇光学和欧菲光 屏幕为三星\n",
      "7[93] 华为供应链公司：已向华为P50手机供货，供货时间有延后\n",
      "8[90] 小米正式发布隔空充电技术 雷军称可实现单设备5瓦远距离充电\n",
      "9[77] 除了欢迎拜登致电华为，任正非还谈了孟晚舟、退休时间、5G转让等\n",
      "10[68] 外媒：夺回华为失去的市场，荣耀仍能重现辉煌\n",
      "11[68] 争国内第一！荣耀想打败华为小米，靠3599的V40行么？\n",
      "12[48] 小米国内机型不再支持自行安装GMS框架，国际版不受影响\n",
      "13[21] 男子年会中奖“清空购物车”，不料里面有套房，老板：过分了\n"
     ]
    }
   ],
   "source": [
    "# Print the hits for the same query as numbered lines: rank, score in brackets, title.\n",
    "searcher_v1.render_search_result('华为手机')"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}